Library

packages

#install.packages("tidyverse")
library(tidyverse)
#BiocManager::install("infercnv")
library(infercnv)
#install.packages("Seurat")
library(Seurat)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("kableExtra")
library(kableExtra)
#install.packages("data.table")
library(data.table)

Wilcoxon Test

plot-state: single normal sample from PC (BRCA1 vs TN)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
b17 <- read.table(file = "~/brca-infercnv/brca_output_dir_min_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))


#remove col names
b17 <- b17[2:length(rownames(b17)),]

#Amplifications
b17amp <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
t17 <- read.table(file = "~/brca-infercnv/tn_output_dir_subset_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
t17 <- t17[2:length(rownames(t17)),]

#Amplifications
t17amp <- t17 %>% 
  mutate(type = gsub("\\..*","", t17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#combine dfs
amp <- rbind(b17amp, t17amp)

#generate plot
plot <- ggplot(amp, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "BRCA1 Tumors (TN_B1) and TN tumors (TN)",
    y = "Sum of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
plot

state: single wilcoxon test

#create separate b1 df
b1amp <- b17amp%>% mutate(genotype = `Sample Name`)
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
b1amp <- b1amp %>% filter(genotype %in% b1.names)
b1amp$genotype <- str_sub(b1amp$genotype, start = 1, end = 5)
b1amp$genotype <- sub("TN_B1", "BRCA1 Tumor", b1amp$genotype)

#create separate tn df
tnamp <- t17amp %>% mutate(genotype = `Sample Name`)
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")
tnamp <- tnamp %>% filter(genotype %in% tn.names)
tnamp$genotype <- str_sub(tnamp$genotype, start = 1, end = 2)
tnamp$genotype <- sub("TN", "TN Tumor", tnamp$genotype)

#make numeric
b1amp$state <- as.numeric(b1amp$`Summed States`)
tnamp$state <- as.numeric(tnamp$`Summed States`)

#xilcox test for tumors
w.tum.st <- wilcox.test(b1amp$`Summed States`, tnamp$state, alternative = "two.sided")
w.tum.st #W = 21200, p-value < 2.2e-16
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b1amp$`Summed States` and tnamp$state
## W = 24707, p-value = 6.729e-12
## alternative hypothesis: true location shift is not equal to 0

plot-length: single normal sample from PC (BRCA1 vs TN)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
b17 <- read.table(file = "~/brca-infercnv/tn_b1_epi_output_dir_subset_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
b17 <- b17[2:length(rownames(b17)),]

#Amplifications
b17amp <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
t17 <- read.table(file = "~/brca-infercnv/tn_output_dir_subset_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
t17 <- t17[2:length(rownames(t17)),]

#Amplifications
t17amp <- t17 %>% 
  mutate(type = gsub("\\..*","", t17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#combine dfs
amp <- rbind(b17amp, t17amp)

#violin plot
plot <- ggplot(amp, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "BRCA1 Tumors (TN_B1) and TN tumors (TN)",
    y = "Summed Length of Amplifications"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
plot

length: single wilcoxon test

#create separate b1 df
b1amp <- b17amp%>% mutate(genotype = `Sample Name`)
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
b1amp <- b1amp %>% filter(genotype %in% b1.names)
b1amp$genotype <- str_sub(b1amp$genotype, start = 1, end = 5)
b1amp$genotype <- sub("TN_B1", "BRCA1 Tumor", b1amp$genotype)

#create separate tn df
tnamp <- t17amp %>% mutate(genotype = `Sample Name`)
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")
tnamp <- tnamp %>% filter(genotype %in% tn.names)
tnamp$genotype <- str_sub(tnamp$genotype, start = 1, end = 2)
tnamp$genotype <- sub("TN", "TN Tumor", tnamp$genotype)

#make numeric
b1amp$`Summed Length` <- as.numeric(b1amp$`Summed Length`)
tnamp$`Summed Length` <- as.numeric(tnamp$`Summed Length`)

#xilcox test for tumors
w.tum.ln <- wilcox.test(b1amp$`Summed Length`, tnamp$`Summed Length`, alternative = "two.sided")
w.tum.ln #W = 23113, p-value = 4.962e-14
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b1amp$`Summed Length` and tnamp$`Summed Length`
## W = 35624, p-value = 0.01283
## alternative hypothesis: true location shift is not equal to 0

plot-state: multiple samples from PC (B1 vs pre-neoplastic)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

b17 <- read.table(file = "~/brca-infercnv/brca_output_dir_pc_multi/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
b17 <- b17[2:length(rownames(b17)),]

#Amplifications
b17amp <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#generate plot
bplot <- ggplot(b17amp, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
        x = "Preneoplastic tissue (B1) and BRCA1 tumors (TN_B1)",
    y = "Sum of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
bplot

state: multiple wilcoxon test

#seperate B1 and PN
b1amp <- b17amp %>% mutate(genotype = `Sample Name`)
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
b1amp <- b1amp %>% filter(genotype %in% b1.names)
b1amp$genotype <- str_sub(b1amp$genotype, start = 1, end = 5)
b1amp$genotype <- sub("TN_B1", "BRCA1 Tumor", b1amp$genotype)

pnamp <- b17amp %>% mutate(genotype = `Sample Name`)
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")
pnamp <- pnamp %>% filter(genotype %in% pn.names)
pnamp$genotype <- str_sub(pnamp$genotype, start = 1, end = 2)
pnamp <- pnamp %>% filter(genotype == "B1")
pnamp$genotype <- sub("B1", "BRCA1 Preneoplastic", pnamp$genotype)

#make numeric
b1amp$`Summed States` <- as.numeric(b1amp$`Summed States`)
pnamp$`Summed States` <- as.numeric(pnamp$`Summed States`)

#wilcox test for brca carriers
w.brca.st <- wilcox.test(b1amp$`Summed States`, pnamp$`Summed States`, alternative = "two.sided")
w.brca.st #W = 156334, p-value < 2.2e-16
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b1amp$`Summed States` and pnamp$`Summed States`
## W = 316796, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

plot-length: multiple samples from PC (B1 vs pre-neoplastic)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
b17 <- read.table(file = "~/brca-infercnv/brca_output_dir_pc_multi/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
b17 <- b17[2:length(rownames(b17)),]

#Amplifications
b17amp <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#violin plot
bplot <- ggplot(b17amp, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
        x = "Preneoplastic tissue (B1) and BRCA1 tumors (TN_B1)",
    y = "Summed Length of Amplifications"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
bplot

length: multiple wilcoxon test

#seperate B1 and PN
#create separate b1 df
b1amp <- b17amp%>% mutate(genotype = `Sample Name`)
b1.names <- c("TN_B1_0131", "TN_B1_0177", "TN_B1_4031", "TN_B1_0554")
b1amp <- b1amp %>% filter(genotype %in% b1.names)
b1amp$genotype <- str_sub(b1amp$genotype, start = 1, end = 5)
b1amp$genotype <- sub("TN_B1", "BRCA1 Tumor", b1amp$genotype)

pnamp <- b17amp %>% mutate(genotype = `Sample Name`)
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")
pnamp <- pnamp %>% filter(genotype %in% pn.names)
pnamp$genotype <- str_sub(pnamp$genotype, start = 1, end = 2)
pnamp <- pnamp %>% filter(genotype == "B1")
pnamp$genotype <- sub("B1", "BRCA1 Preneoplastic", pnamp$genotype)

#make numeric
b1amp$`Summed Length` <- as.numeric(b1amp$`Summed Length`)
pnamp$`Summed Length` <- as.numeric(pnamp$`Summed Length`)

#wilcox test for brca carriers
w.brca.ln <- wilcox.test(b1amp$`Summed Length`, pnamp$`Summed Length`, alternative = "two.sided")
w.brca.ln #W = 157272, p-value < 2.2e-16
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  b1amp$`Summed Length` and pnamp$`Summed Length`
## W = 305974, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

plot-state: multiple samples from PC (TN vs normal premenopausal)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
t17 <- read.table(file = "~/brca-infercnv/tp_subset_output_dir/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
t17 <- t17[2:length(rownames(t17)),]

t17amp <- t17 %>% 
  mutate(type = gsub("\\..*","", t17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#generate plot
tplot <- ggplot(t17amp, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
        x = "Premenopausal tissue (N) and TN Tumors (TN)",
    y = "Sum of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
tplot

state: multiple wilcoxon test

#create separate tn df
tnamp <- t17amp %>% mutate(genotype = `Sample Name`)
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")
tnamp <- tnamp %>% filter(genotype %in% tn.names)
tnamp$genotype <- str_sub(tnamp$genotype, start = 1, end = 2)
tnamp$genotype <- sub("TN", "TN Tumor", tnamp$genotype)

tpamp <- t17amp %>% mutate(genotype = `Sample Name`)
tp.names <- c("N_0019",  "N_0233",  "N_0092", "N_0093",  "N_0123",  "N_0064",  "N_0169")
tpamp <- tpamp %>% filter(genotype %in% tp.names)
tpamp$genotype <- str_sub(tpamp$genotype, start = 1, end = 1)
tpamp$genotype <- sub("N", "Human Premenopausal", tpamp$genotype)

#make numeric
tnamp$`Summed States` <- as.numeric(tnamp$`Summed States`)
tpamp$`Summed States` <- as.numeric(tpamp$`Summed States`)

#wilcox test for brca carriers
w.tn.st <- wilcox.test(tnamp$`Summed States`, tpamp$`Summed States`, alternative = "two.sided")
w.tn.st #W = 1126640457, p-value = 0.6612
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  tnamp$`Summed States` and tpamp$`Summed States`
## W = 40473, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

plot-length: multiple samples from PC (TN vs normal premenopausal)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
t17 <- read.table(file = "~/brca-infercnv/tp_subset_output_dir/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))


#remove col names
t17 <- t17[2:length(rownames(t17)),]

#Amplifications
t17amp <- t17 %>% 
  mutate(type = gsub("\\..*","", t17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#violin plot
tplot <- ggplot(t17amp, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
        x = "Premenopausal tissue (N) and TN Tumors (TN)",
    y = "Summed Length of Amplifications"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
tplot

length: multiple wilcoxon test

#tn df
tnamp <- t17amp %>% mutate(genotype = `Sample Name`)
tn.names <- c("TN_0106", "TN_0126", "TN_0114", "TN_0135")
tnamp <- tnamp %>% filter(genotype %in% tn.names)
tnamp$genotype <- str_sub(tnamp$genotype, start = 1, end = 2)
tnamp$genotype <- sub("TN", "TN Tumor", tnamp$genotype)

#tp df
tpamp <- t17amp %>% mutate(genotype = `Sample Name`)
tp.names <- c("N_0019",  "N_0233",  "N_0092", "N_0093",  "N_0123",  "N_0064",  "N_0169")
tpamp <- tpamp %>% filter(genotype %in% tp.names)
tpamp$genotype <- str_sub(tpamp$genotype, start = 1, end = 1)
tpamp$genotype <- sub("N", "Human Premenopausal", tpamp$genotype)

#make numeric
tnamp$`Summed Length` <- as.numeric(tnamp$`Summed Length`)
tpamp$`Summed Length` <- as.numeric(tpamp$`Summed Length`)

#wilcox test for brca carriers
w.tn.ln <- wilcox.test(tnamp$`Summed Length`, tpamp$`Summed Length`, alternative = "two.sided")
w.tn.ln #W = 1239947246, p-value < 2.2e-16
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  tnamp$`Summed Length` and tpamp$`Summed Length`
## W = 36669, p-value = 2.808e-15
## alternative hypothesis: true location shift is not equal to 0

plot-state: single normal sample from PC (Pre-neo vs Human Premeno)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#load data
pn17 <- read.table(file = "~/brca-infercnv/pn_output_dir_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
pn17 <- pn17[2:length(rownames(pn17)),]

#Amplifications
pn17amp <- pn17 %>% 
  mutate(type = gsub("\\..*","", pn17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE)

#load data
tp17 <- read.table(file = "~/brca-infercnv/tp_output_dir_05_13/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
tp17 <- tp17[2:length(rownames(tp17)),]

tp.names <- c("N_0019",  "N_0233",  "N_0092", "N_0093",  "N_0123",
              "N_0064",  "N_0169")

#Amplifications
tp17amp <- tp17 %>% 
  mutate(type = gsub("\\..*","", tp17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  filter(`Sample Name` %in% tp.names) %>%
  distinct(cell_group_name, .keep_all = TRUE)

#combine dfs
norm17amp <- rbind(pn17amp, tp17amp)

#generate plot
normplot <- ggplot(norm17amp, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "Preneoplastic Tissue (B1) and Premenopausal Tissue (N)",
    y = "Sum of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
normplot

state: single wilcoxon test

#create separate preneo df
pnamp <- pn17amp %>% mutate(genotype = `Sample Name`)
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")
pnamp <- pnamp %>% filter(genotype %in% pn.names)
pnamp$genotype <- str_sub(pnamp$genotype, start = 1, end = 2)
pnamp <- pnamp %>% filter(genotype == "B1")
pnamp$genotype <- sub("B1", "BRCA1 Preneoplastic", pnamp$genotype)

tpamp <- tp17amp %>% mutate(genotype = `Sample Name`)
tp.names <- c("N_0019",  "N_0233",  "N_0092", "N_0093",  "N_0123",
              "N_0064",  "N_0169")
tpamp <- tpamp %>% filter(genotype %in% tp.names)
tpamp$genotype <- str_sub(tpamp$genotype, start = 1, end = 1)
tpamp$genotype <- sub("N", "Human Premenopausal", tpamp$genotype)

#make numeric
pnamp$`Summed States` <- as.numeric(pnamp$`Summed States`)
tpamp$`Summed States` <- as.numeric(tpamp$`Summed States`)

#xilcox test for tumors
w.norm.st <- wilcox.test(pnamp$`Summed States`, tpamp$`Summed States`, alternative = "two.sided")
w.norm.st #W = 96990, p-value = 0.4548
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  pnamp$`Summed States` and tpamp$`Summed States`
## W = 132752, p-value = 0.1886
## alternative hypothesis: true location shift is not equal to 0

plot-length: single normal sample from PC (Preneo vs Human Premeno)

#extract data from brca1 preneo
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#load data
pn17 <- read.table(file = "~/brca-infercnv/pn_output_dir_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                   col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
pn17 <- pn17[2:length(rownames(pn17)),]

#Amplifications
pn17amp <- pn17 %>% 
  mutate(type = gsub("\\..*","", pn17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#extract data from tp meno
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#load data
tp17 <- read.table(file = "~/brca-infercnv/tp_output_dir_05_13/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
tp17 <- tp17[2:length(rownames(tp17)),]

#Amplifications
tp17amp <- tp17 %>% 
  mutate(type = gsub("\\..*","", tp17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE) %>%
  filter(`Sample Name` != "N_1105_epi")

#combine dfs
norm17amp <- rbind(pn17amp, tp17amp)

#violin plot
normplot <- ggplot(norm17amp, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "Preneoplastic Tissue (B1) and Premenopausal Tissue (N)",
    y = "Summed Length of Amplifications"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
normplot

length: single wilcoxon test

#create separate preneo df
pnamp <- pn17amp %>% mutate(genotype = `Sample Name`)
pn.names <- c("B1_0894", "B1_0033", "B1_0023", "B1_0090")
pnamp <- pnamp %>% filter(genotype %in% pn.names)
pnamp$genotype <- str_sub(pnamp$genotype, start = 1, end = 2)
pnamp <- pnamp %>% filter(genotype == "B1")
pnamp$genotype <- sub("B1", "BRCA1 Preneoplastic", pnamp$genotype)

tpamp <- tp17amp %>% mutate(genotype = `Sample Name`)
tp.names <- c("N_0019",  "N_0233",  "N_0092", "N_0093",  "N_0123",  "N_0064",  "N_0169")
tpamp <- tpamp %>% filter(genotype %in% tp.names)
tpamp$genotype <- str_sub(tpamp$genotype, start = 1, end = 1)
tpamp$genotype <- sub("N", "Human Premenopausal", tpamp$genotype)

#make numeric
pnamp$`Summed Length` <- as.numeric(pnamp$`Summed Length`)
tpamp$`Summed Length` <- as.numeric(tpamp$`Summed Length`)

#xilcox test for tumors
w.norm.ln <- wilcox.test(pnamp$`Summed Length`, tpamp$`Summed Length`, alternative = "two.sided")
w.norm.ln #W = 101848, p-value = 0.9706
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  pnamp$`Summed Length` and tpamp$`Summed Length`
## W = 112481, p-value = 7.063e-08
## alternative hypothesis: true location shift is not equal to 0

Wilcoxon results table

table of wilcoxon results

#combine each result
brca.st <- list(comparison = "BRCA1 Tumor vs Preneoplastic",
                statistic = w.brca.st$statistic,
                p.value = w.brca.st$p.value,
                event = "Amplification",
                value.type = "state")
brca.st <- as.data.frame(brca.st, row.names = NULL)

tum.st <- list(comparison = "BRCA1 Tumor vs TN tumor",
                statistic = w.tum.st$statistic,
                p.value = w.tum.st$p.value,
                event = "Amplification",
                value.type = "state")
tum.st <- as.data.frame(tum.st, row.names = NULL)

brca.ln <- list(comparison = "BRCA1 Tumor vs Preneoplastic",
                statistic = w.brca.ln$statistic,
                p.value = w.brca.ln$p.value,
                event = "Amplification",
                value.type = "length")
brca.ln <- as.data.frame(brca.ln, row.names = NULL)

tum.ln <- list(comparison = "BRCA1 Tumor vs TN tumor",
                statistic = w.tum.ln$statistic,
                p.value = w.tum.ln$p.value,
                event = "Amplification",
                value.type = "length")
tum.ln <- as.data.frame(tum.ln, row.names = NULL)

tn.st <- list(comparison = "TN Tumor vs Premenopausal",
                statistic = w.tn.st$statistic,
                p.value = w.tn.st$p.value,
                event = "Amplification",
                value.type = "state")
tn.st <- as.data.frame(tn.st, row.names = NULL)

tn.ln <- list(comparison = "TN Tumor vs Premenopausal",
                statistic = w.tn.ln$statistic,
                p.value = w.tn.ln$p.value,
                event = "Amplification",
                value.type = "length")
tn.ln <- as.data.frame(tn.ln, row.names = NULL)

norm.st <- list(comparison = "Preneoplastic vs Premenopausal",
                statistic = w.norm.st$statistic,
                p.value = w.norm.st$p.value,
                event = "Amplification",
                value.type = "state")
norm.st <- as.data.frame(norm.st, row.names = NULL)

norm.ln <- list(comparison = "Preneoplastic vs Premenopausal",
                statistic = w.norm.ln$statistic,
                p.value = w.norm.ln$p.value,
                event = "Amplification",
                value.type = "length")
norm.ln <- as.data.frame(norm.ln, row.names = NULL)

#set as datatable

#group results
wlx.df <- rbind(
  as.data.frame(brca.st, row.names = NULL),
  as.data.frame(brca.ln, row.names = NULL),
  as.data.frame(tum.st, row.names = NULL),              
  as.data.frame(tum.ln, row.names = NULL),
  as.data.frame(tn.st, row.names = NULL),
  as.data.frame(tn.ln, row.names = NULL),
  as.data.frame(norm.st, row.names = NULL),
  as.data.frame(norm.ln, row.names = NULL)
              )

#create table
wlx.df %>%
  kbl(caption = "Wilcoxon Rank Sum Test Results", digits = 10) %>%
  kable_classic(html_font = "Cambria") %>%
  kable_styling() 
Wilcoxon Rank Sum Test Results
comparison statistic p.value event value.type
BRCA1 Tumor vs Preneoplastic 316796.5 0.0000000000 Amplification state
BRCA1 Tumor vs Preneoplastic 305974.0 0.0000000000 Amplification length
BRCA1 Tumor vs TN tumor 24707.0 0.0000000000 Amplification state
BRCA1 Tumor vs TN tumor 35624.0 0.0128322515 Amplification length
TN Tumor vs Premenopausal 40473.0 0.0000000000 Amplification state
TN Tumor vs Premenopausal 36669.0 0.0000000000 Amplification length
Preneoplastic vs Premenopausal 132752.0 0.1885935736 Amplification state
Preneoplastic vs Premenopausal 112481.0 0.0000000706 Amplification length

Median fold change code (BRCA1 Tumor vs TN Tumor)

plot-state: single normal sample from PC (BRCA1 vs TN)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
b17 <- read.table(file = "~/brca-infercnv/tn_b1_epi_output_dir_subset_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
b17 <- b17[2:length(rownames(b17)),]

#Amplifications
b17amp <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#generate plot
bplot <- ggplot(b17amp, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "BRCA1 Tumors (TN_B1) and TN tumors (TN)",
    y = "Sum of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
bplot

state: median fold change (use data from multi-sample brca1 run)

#seperate B1 and TN
b1amp <- b17amp %>% mutate(genotype = cell_group_name)
b1amp$genotype <- str_sub(b1amp$genotype, start = 1, end = 5)
b1amp <- b1amp %>% filter(genotype == "TN_B1") %>%
  mutate(genotype = "BRCA1 Tumor")

#seperate B1 and TN
tnamp <- b17amp %>% mutate(genotype = cell_group_name)
tnamp$genotype <- str_sub(tnamp$genotype, start = 1, end = 5)
tnamp <- tnamp %>% filter(genotype != "TN_B1")
tnamp$genotype <- str_sub(tnamp$genotype, start = 1, end = 2)
tnamp <- tnamp %>% filter(genotype == "TN") %>%
  mutate(genotype = "TN Tumor")

#make numeric
b1amp$`Summed States` <- as.numeric(b1amp$state)
tnamp$`Summed States` <- as.numeric(tnamp$state)

#new df
amp <- rbind(b1amp, tnamp)
amp <- amp %>% rename(sample = `Sample Name`)

# Aggregate sizes based on event type and Genotype for Amplifications in MB
mdn.smpl <- amp %>%  
  group_by(sample, genotype) %>%
  summarise(mb.amp = sum(state))

# Aggregate sizes based on event type and Genotype for Amplifications in MB
mdn.gntp <- amp %>%  
  group_by(genotype) %>%
  summarise(mb.amp = sum(state))

# Calculate the median fold change for Amplification events in genotypes
st.mdn.smpl <- mdn.smpl %>%
  group_by(sample, genotype) %>%
  summarise(`median value (amp)` = median(mb.amp)) %>%
  ungroup() %>%
  mutate(`median fold change (amp)` = `median value (amp)` / `median value (amp)`[genotype == "TN Tumor"])

#label as state
st.mdn.smpl <- st.mdn.smpl %>% mutate(type = "state")


#median fold change for genotypes
st.mdn.gntp <- mdn.gntp %>%
  group_by(genotype) %>%
  summarise(`median value (amp)` = median(mb.amp)) %>%
  ungroup() %>%
  mutate(`median fold change (amp)` = `median value (amp)` / `median value (amp)`[genotype == "TN Tumor"])

#label as state
st.mdn.gntp <- st.mdn.gntp %>% mutate(type = "state")

plot-length: single normal sample from PC (BRCA1 vs TN)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
b17 <- read.table(file = "~/brca-infercnv/tn_b1_epi_output_dir_subset_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
b17 <- b17[2:length(rownames(b17)),]

#Amplifications
b17amp <- b17 %>% 
  mutate(type = gsub("\\..*","", b17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE)

#violin plot
bplot <- ggplot(b17amp, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "BRCA1 Tumors (TN_B1) and TN tumors (TN)",
    y = "Summed Length of Amplifications"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
bplot

length: median fold change (use data from multi-sample brca1 run)

#seperate B1 and PN
b1amp <- b17amp %>% mutate(genotype = cell_group_name)
b1amp$genotype <- str_sub(b1amp$genotype, start = 1, end = 5)
b1amp <- b1amp %>% filter(genotype == "TN_B1") %>%
  mutate(genotype = "BRCA1 Tumor")

#seperate B1 and TN
tnamp <- b17amp %>% mutate(genotype = cell_group_name)
tnamp$genotype <- str_sub(tnamp$genotype, start = 1, end = 5)
tnamp <- tnamp %>% filter(genotype != "TN_B1")
tnamp$genotype <- str_sub(tnamp$genotype, start = 1, end = 2)
tnamp <- tnamp %>% filter(genotype == "TN") %>%
  mutate(genotype = "TN Tumor")


#make numeric
b1amp$`Summed Length` <- as.numeric(b1amp$Length)
tnamp$`Summed Length` <- as.numeric(tnamp$Length)

#new df
amp <- rbind(b1amp, tnamp)
amp <- amp %>% rename(sample = `Sample Name`)

# Aggregate sizes based on event type and Genotype for Amplifications in MB
mdn.smpl <- amp %>%  
  group_by(sample, genotype) %>%
  summarise(mb.amp = sum(Length))

# Aggregate sizes based on event type and Genotype for Amplifications in MB
mdn.gntp <- amp %>%  
  group_by(genotype) %>%
  summarise(mb.amp = sum(Length))

# Calculate the median fold change for Amplification events in genotypes
ln.mdn.smpl <- mdn.smpl %>%
  group_by(sample, genotype) %>%
  summarise(`median value (amp)` = median(mb.amp)) %>%
  ungroup() %>%
  mutate(`median fold change (amp)` = `median value (amp)` / `median value (amp)`[genotype == "TN Tumor"])

#label as length
ln.mdn.smpl <- ln.mdn.smpl %>% mutate(type = "length")

#median fold change for genotypes
ln.mdn.gntp <- mdn.gntp %>%
  group_by(genotype) %>%
  summarise(`median value (amp)` = median(mb.amp)) %>%
  ungroup() %>%
  mutate(`median fold change (amp)` = `median value (amp)` / `median value (amp)`[genotype == "TN Tumor"])

#label as length
ln.mdn.gntp <- ln.mdn.gntp %>% mutate(type = "length")

Median fold change results table(BRCA1 Tumor vs TN Tumor)

sample: table of median fold change (BRCA1 vs TN Tumor)

#combine dfs
mdn.smpl <- rbind(st.mdn.smpl, ln.mdn.smpl)

#create table
mdn.smpl %>%
  kbl(caption = "Median Fold Change (Amplification)", digits = 10) %>%
  kable_classic(html_font = "Cambria") %>%
  kable_styling(latex_options = "striped")
Median Fold Change (Amplification)
sample genotype median value (amp) median fold change (amp) type
TN_0106 TN Tumor 29.000000 1.000000 state
TN_0114 TN Tumor 74.000000 1.000000 state
TN_0126 TN Tumor 119.000000 1.000000 state
TN_0135 TN Tumor 48.000000 1.000000 state
TN_B1_0131 BRCA1 Tumor 497.000000 17.137931 state
TN_B1_0177 BRCA1 Tumor 1804.000000 24.378378 state
TN_B1_0554 BRCA1 Tumor 456.000000 3.831933 state
TN_B1_4031 BRCA1 Tumor 381.000000 7.937500 state
TN_0106 TN Tumor 0.271758 1.000000 length
TN_0114 TN Tumor 0.584745 1.000000 length
TN_0126 TN Tumor 1.302619 1.000000 length
TN_0135 TN Tumor 0.251491 1.000000 length
TN_B1_0131 BRCA1 Tumor 2.288344 8.420521 length
TN_B1_0177 BRCA1 Tumor 19.399088 33.175295 length
TN_B1_0554 BRCA1 Tumor 3.889770 2.986115 length
TN_B1_4031 BRCA1 Tumor 1.981174 7.877713 length

genotype: table of median fold change (BRCA1 vs TN Tumor)

#combine dfs
mdn.gntp <- rbind(st.mdn.gntp, ln.mdn.gntp)

#create table
mdn.gntp %>%
  kbl(caption = "Median Fold Change (Amplification)", digits = 10) %>%
  kable_classic(html_font = "Cambria") %>%
  kable_styling(latex_options = "striped")
Median Fold Change (Amplification)
genotype median value (amp) median fold change (amp) type
BRCA1 Tumor 3138.000000 11.62222 state
TN Tumor 270.000000 1.00000 state
BRCA1 Tumor 27.558376 11.43210 length
TN Tumor 2.410613 1.00000 length

Median fold change (BRCA1 Preneoplastic vs Human Premenopausal)

plot-state: single normal sample from PC (Pre-neo vs Human Premeno)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#load data
pn17 <- read.table(file = "~/brca-infercnv/pn_output_dir_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
pn17 <- pn17[2:length(rownames(pn17)),]

#Amplifications
pn17amp <- pn17 %>% 
  mutate(type = gsub("\\..*","", pn17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE)

#load data
tp17 <- read.table(file = "~/brca-infercnv/norm_output_dir_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                     col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
tp17 <- tp17[2:length(rownames(tp17)),]

#Amplifications
tp17amp <- tp17 %>% 
  mutate(type = gsub("\\..*","", tp17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(state = as.numeric(state)) %>%
  group_by(cell_group_name) %>%
  mutate(`Summed States` = sum(state)) %>%
  distinct(cell_group_name, .keep_all = TRUE) %>%
  filter(`Sample Name` != "N_1105")

#combine dfs
norm17amp <- rbind(pn17amp, tp17amp)

#generate plot
normplot <- ggplot(norm17amp, aes(x = `Sample Name`, y = `Summed States`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "Preneoplastic Tissue (B1) and Premenopausal Tissue (N)",
    y = "Sum of Amplifications"
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#output plot
normplot

state: median fold change (use data from multi-sample tn run)

#seperate B1 and PN
pnamp <- norm17amp %>% mutate(genotype = cell_group_name)
pnamp$genotype <- str_sub(pnamp$genotype, start = 1, end = 2)
pnamp <- pnamp %>% filter(genotype == "B1") %>%
  mutate(genotype = "BRCA1 preneoplastic")

tpamp <- norm17amp %>% mutate(genotype = cell_group_name)
tpamp <- tpamp %>% filter(`Sample Name` != "N_1105_epi")
tpamp$genotype <- str_sub(tpamp$genotype, start = 1, end = 1)
tpamp <- tpamp %>% filter(genotype == "N") %>%
  mutate(genotype = "premenopausal")

#make numeric
pnamp$`Summed States` <- as.numeric(pnamp$state)
tpamp$`Summed States` <- as.numeric(tpamp$state)

#new df
amp <- rbind(pnamp, tpamp)
amp <- amp %>% rename(sample = `Sample Name`)

# Aggregate sizes based on event type and Genotype for Amplifications in MB
mdn.smpl <- amp %>%  
  group_by(sample, genotype) %>%
  summarise(mb.amp = sum(state))

# Aggregate sizes based on event type and Genotype for Amplifications in MB
mdn.gntp <- amp %>%  
  group_by(genotype) %>%
  summarise(mb.amp = sum(state))

# Calculate the median fold change for Amplification events in genotypes
st.mdn.smpl <- mdn.smpl %>%
  group_by(sample, genotype) %>%
  summarise(`median value (amp)` = median(mb.amp)) %>%
  ungroup() %>%
  mutate(`median fold change (amp)` = `median value (amp)` / `median value (amp)`[genotype == "premenopausal"])

#label as state
st.mdn.smpl <- st.mdn.smpl %>% mutate(type = "state")

#median fold change for genotypes
st.mdn.gntp <- mdn.gntp %>%
  group_by(genotype) %>%
  summarise(`median value (amp)` = median(mb.amp)) %>%
  ungroup() %>%
  mutate(`median fold change (amp)` = `median value (amp)` / `median value (amp)`[genotype == "premenopausal"])

#label as state
st.mdn.gntp <- st.mdn.gntp %>% mutate(type = "state")

plot-length: single normal sample from PC (Preneo vs Human Premeno)

#extract data from brca
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#load data
pn17 <- read.table(file = "~/brca-infercnv/pn_output_dir_1_norm/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                   col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
pn17 <- pn17[2:length(rownames(pn17)),]

#Amplifications
pn17amp <- pn17 %>% 
  mutate(type = gsub("\\..*","", pn17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE) 

#load data
tp17 <- read.table(file = "~/brca-infercnv/norm_output_dir_min/17_HMM_predHMMi6.leiden.hmm_mode-subclusters.pred_cnv_genes.dat",
                   col.names = c("cell_group_name",  "gene_region_name", "state", "gene", "chr", "start", "end"))

#remove col names
tp17 <- tp17[2:length(rownames(tp17)),]

#Amplifications
tp17amp <- tp17 %>% 
  mutate(type = gsub("\\..*","", tp17$cell_group_name)) %>% 
  filter(state > 3) %>%
  rename(`Sample Name` = type) %>%
  mutate(Length = (as.numeric(end) - as.numeric(start)) / 1e6) %>% 
  group_by(cell_group_name) %>%
  mutate(`Summed Length` = sum(Length)) %>% 
  distinct(cell_group_name, .keep_all = TRUE) %>%
  filter(`Sample Name` != "N_1105")

#combine dfs
norm17amp <- rbind(pn17amp, tp17amp)

#violin plot
normplot <- ggplot(norm17amp, aes(x = `Sample Name`, y = `Summed Length`, fill = `Sample Name`)) +
  geom_violin(scale = "width", adjust = 1.5) + theme(axis.text.x = element_text(angle =45, hjust = 1)) +
  geom_point(size=0.1, position = "jitter", width = 0.2, size = 2, alpha = 0.7, shape = 21, color = "black") +
  labs(
    title = "Violin Plot of Copy Number Amplifications",
    x = "Preneoplastic Tissue (B1) and Premenopausal Tissue (N)",
    y = "Summed Length of Amplifications"
      ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 15) +
  theme(legend.position = "none",
        panel.border = element_rect(fill = NA, color = "black", linewidth = 1.5),
        strip.background = element_rect(fill = "grey90", color = "black", size = 1.5),
        panel.spacing = unit(1, "lines"),
        plot.title = element_text(face = "bold", hjust = 0.5),
        axis.title.x = element_text(face = "bold"),
        axis.title.y = element_text(face = "bold"),
        axis.text.x = element_text(angle = 45, hjust = 1))

#plot
normplot

length: median fold change (use data from multi-sample tn run)

#seperate TP and PN
pnamp <- norm17amp %>% mutate(genotype = cell_group_name)
pnamp$genotype <- str_sub(pnamp$genotype, start = 1, end = 2)
pnamp <- pnamp %>% filter(genotype == "B1") %>%
  mutate(genotype = "BRCA1 preneoplastic")

#seperate TP and PN
tpamp <- norm17amp %>% mutate(genotype = cell_group_name)
tpamp <- tpamp %>% filter(`Sample Name` != "N_1105")
tpamp$genotype <- str_sub(tpamp$genotype, start = 1, end = 1)
tpamp <- tpamp %>% filter(genotype == "N") %>%
  mutate(genotype = "premenopausal")

#make numeric
pnamp$`Summed Length` <- as.numeric(pnamp$Length)
tpamp$`Summed Length` <- as.numeric(tpamp$Length)

#new df
amp <- rbind(pnamp, tpamp)
amp <- amp %>% rename(sample = `Sample Name`)

# Aggregate sizes based on event type and Genotype for Amplifications in MB
mdn.smpl <- amp %>%  
  group_by(sample, genotype) %>%
  summarise(mb.amp = sum(Length))

# Aggregate sizes based on event type and Genotype for Amplifications in MB
mdn.gntp <- amp %>%  
  group_by(genotype) %>%
  summarise(mb.amp = sum(Length))

# Calculate the median fold change for Amplification events in genotypes
ln.mdn.smpl <- mdn.smpl %>%
  group_by(sample, genotype) %>%
  summarise(`median value (amp)` = median(mb.amp)) %>%
  ungroup() %>%
  mutate(`median fold change (amp)` = `median value (amp)` / `median value (amp)`[genotype == "premenopausal"])

#label as state
ln.mdn.smpl <- ln.mdn.smpl %>% mutate(type = "length")

#median fold change for genotypes
ln.mdn.gntp <- mdn.gntp %>%
  group_by(genotype) %>%
  summarise(`median value (amp)` = median(mb.amp)) %>%
  ungroup() %>%
  mutate(`median fold change (amp)` = `median value (amp)` / `median value (amp)`[genotype == "premenopausal"])

#label as state
ln.mdn.gntp <- ln.mdn.gntp %>% mutate(type = "length")

Median fold change results table (BRCA1 Preneoplastic vs Human Premenopausal)

sample: table of median fold change (Pre-neo vs Premenopausal)

#combine dfs
mdn.smpl <- rbind(st.mdn.smpl, ln.mdn.smpl)

#create table
mdn.smpl %>%
  kbl(caption = "Median Fold Change (Amplification)", digits = 10) %>%
  kable_classic(html_font = "Cambria") %>%
  kable_styling(latex_options = "striped")
Median Fold Change (Amplification)
sample genotype median value (amp) median fold change (amp) type
B1_0023 BRCA1 preneoplastic 768.000000 1.6271186 state
B1_0033 BRCA1 preneoplastic 1036.000000 18.5000000 state
B1_0090 BRCA1 preneoplastic 688.000000 2.2337662 state
B1_0894 BRCA1 preneoplastic 1217.000000 3.2367021 state
N_0019 premenopausal 472.000000 0.7801653 state
N_0064 premenopausal 56.000000 0.2074074 state
N_0092 premenopausal 308.000000 1.5714286 state
N_0093 premenopausal 376.000000 1.2207792 state
N_0123 premenopausal 605.000000 1.2817797 state
N_0169 premenopausal 270.000000 4.8214286 state
N_0230 premenopausal 196.000000 0.6363636 state
N_0233 premenopausal 308.000000 0.8191489 state
B1_0023 BRCA1 preneoplastic 11.832993 1.4229113 length
B1_0033 BRCA1 preneoplastic 8.696778 8.3180171 length
B1_0090 BRCA1 preneoplastic 8.279110 4.5324554 length
B1_0894 BRCA1 preneoplastic 19.695036 3.4807972 length
N_0019 premenopausal 8.316044 0.8064670 length
N_0064 premenopausal 1.045535 0.2104788 length
N_0092 premenopausal 1.826628 0.8915064 length
N_0093 premenopausal 5.658197 1.2377085 length
N_0123 premenopausal 10.311698 1.6163147 length
N_0169 premenopausal 4.967413 0.5973289 length
N_0230 premenopausal 2.048923 1.9596886 length
N_0233 premenopausal 4.571510 2.5027044 length
N_1105_epi premenopausal 6.379759 1.1275251 length

genotype: table of median fold change (Pre-neo vs Premenopausal)

#combine dfs
mdn.gntp <- rbind(st.mdn.gntp, ln.mdn.gntp)

#create table
mdn.gntp %>%
  kbl(caption = "Median Fold Change (Amplification)", digits = 10) %>%
  kable_classic(html_font = "Cambria") %>%
  kable_styling(latex_options = "striped")
Median Fold Change (Amplification)
genotype median value (amp) median fold change (amp) type
BRCA1 preneoplastic 3709.00000 1.431494 state
premenopausal 2591.00000 1.000000 state
BRCA1 preneoplastic 48.50392 1.074862 length
premenopausal 45.12571 1.000000 length